Startup
library(tidyverse)
library(cowplot)
library(ggpmisc)
source("data_import_functions.R")
source("calculation_functions.R")
source("figure_format_functions.R")
all_hla_expanded <- readRDS("all_hla_expanded.RDS")
isb_path <- "/labs/khatrilab/solomonb/covid/isb"
isb_samples <- read_tsv("/labs/khatrilab/solomonb/covid/isb/logs/210217_232725/parallel.log") %>%
separate(Command, into = c(NA, "sample"), sep = " ") %>%
pull(sample) %>% unique()
Get reference alleles
reference_alleles <- read_tsv("GCh38_reference_alleles.txt", col_names = F) %>% pull(1)
genome_region_reference <- get_allele_length(reference_alleles)
Get arcasHLA alignment stats
arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)
Import cell counts
cell_counts_df <- readRDS("isb_sequenced_cell_count.RDS")
Assemble accuracy DF
success_df <- readRDS("isb_success.RDS")
accuracy_df <- readRDS("isb_accuracy_drb345_filtered.RDS")
accuracy_df <- accuracy_df %>%
left_join(
success_df %>% select(sample:field, genotyper, success = accuracy),
by = c("sample", "locus", "field", "genotyper")
)%>%
left_join(alignment_stats_df, by = c("sample", "locus")) %>%
left_join(genome_region_reference, by = "locus") %>%
left_join(cell_counts_df, by = "sample") %>%
mutate_at(vars(c(reads, bp)), as.numeric) %>%
mutate(coverage = (reads * 91)/bp) %>%
mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer")
Coverage based plots
gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number based plots
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

Subsample READS
Import read subsample data
# Import predicted genotypes
subsample_reads_path <- "/labs/khatrilab/solomonb/covid/isb/subsample"
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>%
filter(grepl("^INCOV", sample)) %>%
separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>%
distinct() %>%
pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_reads_df <- subsample_reads_df %>%
select(sample) %>%
distinct() %>%
separate(sample, into = c("sample", "subsample"), sep = "_") %>%
left_join(invitro_df, by = "sample") %>%
unite(sample, sample, subsample, sep = "_") %>%
format_hla_table() %>%
drop_na()
invitro_reads_df
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)
Calculate accuracy
# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
hla_df = bind_rows(
subsample_reads_df,
invitro_reads_df
),
reference = "invitro",
method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, "isb_subsample_reads_accuracy.RDS")
# Calculate success
subsample_reads_success_df <- compare_hla(
hla_df = bind_rows(
subsample_reads_df,
invitro_reads_df
),
reference = "invitro",
method = "success"
)
# saveRDS(subsample_reads_success_df, "isb_subsample_reads_success.RDS")
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>%
left_join(
subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
by = c("sample", "locus", "field", "genotyper")
)%>%
left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>%
left_join(genome_region_reference, by = "locus") %>%
bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
mutate_at(vars(c(reads, bp)), as.numeric) %>%
mutate(coverage = (reads * 91)/bp) %>% # Calculate coverage
mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% # Calculate n_alleles
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer")
Reads plots
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy")
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy")
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success")
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)

plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
- Default arcasHLA cut off 75 reads (log10 = 1.8)
subsample_reads_combined_df %>%
filter(field == "field_2") %>%
mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>%
ggplot(aes(y = n_alleles, x = log10(coverage)))+
# geom_violin(scale = "count", alpha = 0.5, )+
geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
# geom_boxplot(width=0.1)+
# ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
# coord_flip()+
facet_grid(genotyper ~ locus, scales = "free")+
theme_bw()

Subsample CELLS
# Import predicted genotypes
subsample_cells_path <- "/labs/khatrilab/solomonb/covid/isb/subsample_cells"
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>%
filter(grepl("^INCOV", sample)) %>%
separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>%
distinct() %>%
pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_cells_df <- subsample_cells_df %>%
select(sample) %>%
distinct() %>%
separate(sample, into = c("sample", "subsample"), sep = "_") %>%
left_join(invitro_df, by = "sample") %>%
unite(sample, sample, subsample, sep = "_") %>%
format_hla_table() %>%
drop_na()
invitro_cells_df
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
# Import cell counts for each subsample
subsample_cell_counts_df <- tibble(line = system("wc -l /labs/khatrilab/solomonb/covid/isb/subsample_cells/barcodes/INCOV*", intern = T)) %>%
separate(line, into = c("n_cells", "file"), sep = " /") %>%
drop_na() %>%
mutate(sample = gsub("\\..*","",basename(file))) %>%
mutate(n_cells = as.numeric(n_cells)) %>%
select(-file) %>%
# separate(sample, into = c("sample", "subset"), sep = "_") %>%
bind_rows(cell_counts_df)
Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [883].
subsample_cell_counts_df
Calculate accuracy
# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
hla_df = bind_rows(
subsample_cells_df,
invitro_cells_df
),
reference = "invitro",
method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, "isb_subsample_cells_accuracy.RDS")
# Calculate success
subsample_cells_success_df <- compare_hla(
hla_df = bind_rows(
subsample_cells_df,
invitro_cells_df
),
reference = "invitro",
method = "success"
)
# saveRDS(subsample_cells_success_df, "isb_subsample_cells_success.RDS")
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>%
left_join(
subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
by = c("sample", "locus", "field", "genotyper")
)%>%
left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>%
left_join(genome_region_reference, by = "locus") %>%
left_join(subsample_cell_counts_df, by = "sample") %>%
bind_rows(accuracy_df) %>%
mutate_at(vars(c(reads, bp)), as.numeric) %>%
mutate(coverage = (reads * 91)/bp) %>%
mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer")
Coverage plots
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number plots
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy")
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy")
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success")
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success")
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
# subsample_cells_combined_df %>%
# mutate(reads = reads/n_cells) %>%
# filter(reads < 100) %>%
# gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F)
Combined plots
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
plot_grid(
plt_read_accuracy_abc + no_leg,
plt_read_success_abc + no_leg,
plt_read_allele_abc,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[1:3]
),
plot_grid(
plt_cell_accuracy_abc + no_leg,
plt_cell_success_abc + no_leg,
plt_cell_allele_abc,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[4:6]
),
plot_grid(
legend,
legend,
NA,
ncol = 1
),
nrow = 1,
rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
plot_grid(
plt_read_accuracy_all + no_leg,
plt_read_success_all + no_leg,
plt_read_allele_all,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[1:3]
),
plot_grid(
plt_cell_accuracy_all + no_leg,
plt_cell_success_all + no_leg,
plt_cell_allele_all,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[4:6]
),
plot_grid(
legend,
legend,
NA,
ncol = 1
),
nrow = 1,
rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Slope stats
get_lm_stats <- function(df, x_var = "coverage", y_var = "accuracy", field_var = "field_2", x_log = T){
# Input checks
arg_col <- makeAssertCollection()
assertChoice(y_var, c("accuracy", "success", "n_alleles"), add = arg_col)
assertChoice(x_var, c("reads", "coverage", "n_cells"), add = arg_col)
assertChoice(field_var, c("field_1", "field_2", "field_3"), add = arg_col)
assertLogical(x_log, add = arg_col)
if (arg_col$isEmpty()==F) {map(arg_col$getMessages(),print);reportAssertions(arg_col)}
# browser()
df <- df %>%
filter(field == field_var) %>%
remove_invalid_combinations()
if (x_log ==T){df <- df %>% mutate_at(x_var, log10)}
df %>%
filter_at(x_var, function(x) !is.na(x) & !is.infinite(x)) %>%
group_by(locus, field, genotyper) %>%
nest() %>%
mutate(data = map(data, function(x){
summary(lm(!!sym(y_var) ~ !!sym(x_var), data = x)) %>% broom::tidy() %>% filter(term == x_var)
})) %>% unnest_wider(data) %>%
mutate(est_min = round(estimate - 2*std.error, 2),
est_max = round(estimate + 2*std.error, 2))
}
model_stats <- get_lm_stats(subsample_cells_combined_df, x_var = "reads", y_var = "accuracy", x_log = T)
model_stats
model_stats <- bind_rows(
get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
) %>%
# filter(grepl("^[ABC]", locus)) %>%
ungroup() %>%
filter(field == "field_2")
Table
col_vars <- c("x_var", "locus")
row_vars <- c("y_var", "genotyper")
flex_df <- model_stats %>%
# filter(grepl("^[ABC]", locus)) %>%
ungroup() %>%
filter(field == "field_2") %>%
mutate(CI = sprintf("(%s) - (%s)", est_min, est_max)) %>%
mutate(genotyper = reformat_hla_genotyper(genotyper)) %>%
select(genotyper, locus, CI, x_var, y_var) %>%
pivot_wider(names_from = col_vars, values_from = "CI", names_sep = "-")
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(col_vars)` instead of `col_vars` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
df_key <- tibble(col_keys = names(flex_df)) %>%
filter(!(col_keys %in% c(col_vars, row_vars))) %>%
separate(col_keys, into = col_vars, sep = "-", remove = F) %>%
mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>%
arrange_at(col_vars) %>%
mutate_all(as.character) %>%
mutate_at(col_vars[!(col_vars %in% "locus")], str_to_sentence)
flex_df %>%
select(row_vars, everything()) %>%
mutate_at(row_vars[!(row_vars %in% "genotyper")], str_to_sentence) %>%
# select(locus, df_key$col_keys) %>%
flextable() %>%
colformat_char(na_str = "---") %>%
merge_v(j=1:2) %>%
set_header_df(mapping = df_key, key = "col_keys") %>%
merge_h(part = "header") %>%
theme_vanilla() %>%
# vline(j=vlines_sequence, border = fp_border_default()) %>%
fix_border_issues() %>%
autofit() %>%
fontsize(size = 8, part = "all") %>%
print(preview = "pptx")
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(row_vars)` instead of `row_vars` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
df <- df %>%
mutate(field = reformat_hla_field(field)) %>%
mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>%
mutate(cell_value = ifelse(!is.na(se),
sprintf("%s %s %s", round(mean_accuracy,2),"\u00B1",round(se,2)),
sprintf("%s", round(mean_accuracy,2)))) %>%
mutate(cell_value = ifelse(grepl("NA", cell_value), NA, cell_value)) %>%
select(-sd,-se, -mean_accuracy) %>%
pivot_wider(names_from = nesting_vars, values_from = "cell_value", names_sep = "-")
Error: Problem with `mutate()` column `field`.
ℹ `field = reformat_hla_field(field)`.
x object 'field' not found
Run `rlang::last_error()` to see where the error occurred.
Scratch work
subsample_cells_combined_df %>%
filter(field == "field_2") %>%
mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>%
ggplot(aes(x = n_alleles, y = log10(n_cells)))+
geom_violin(scale = "width")+
geom_jitter(size = 0.2, height = 0, width =0.05, alpha = 0.2)+
# geom_boxplot(width=0.1)+
# ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
coord_flip()+
facet_grid(genotyper ~ locus, scales = "free")+
theme_bw()
Coverage stats
subsample_cells_combined_df %>%
ggplot(aes(x = n_cells, y = coverage, color = genotyper)) +
geom_point() +
facet_wrap(genotyper ~ locus, scales = "free", nrow = 3) +
theme_bw() +
theme(strip.text = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1))+
scale_color_brewer(palette = "Dark2")
subsample_cells_combined_df %>%
select(-genotyper) %>% distinct() %>%
mutate(cov_per_cell = coverage/n_cells) %>%
ggplot(aes(x = log10(n_cells), y = cov_per_cell)) +
geom_point(color = "black", alpha = 0.1) +
stat_smooth(method = "lm")+
# stat_smooth()+
facet_wrap( ~ locus, scales = "free", nrow = 1) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_color_brewer(palette = "Dark2")
subsample_cells_combined_df %>%
select(-genotyper) %>% distinct() %>%
filter(!is.na(n_cells)) %>%
mutate(cov_per_cell = coverage/n_cells) %>%
mutate(n_cells = cut(n_cells, seq(0,10000,by=2500), right = F)) %>%
ggplot(aes(x = n_cells, y = cov_per_cell)) +
stat_summary(fun = function(x) mean(x,na.rm=T), geom = "bar") +
# geom_point(color = "black", alpha = 0.1) +
# stat_smooth(method = "lm")+
# stat_smooth()+
facet_wrap( ~ locus, scales = "free", nrow = 1) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_color_brewer(palette = "Dark2")
hist(cut(1:100, seq(1,100,10)))
subsample_cells_combined_df %>%
ungroup() %>%
count(genotyper, locus, field)
subsample_reads_combined_df %>%
ungroup() %>%
count(genotyper, locus, field)
---
title: "3) Coverage Analysis"
output: 
  html_notebook:
    toc: true
    toc_depth: 3
    toc_float: true
    code_folding: hide
---


# Startup 

```{r, message = F}
library(tidyverse)
library(cowplot)
library(ggpmisc)

source("data_import_functions.R")
source("calculation_functions.R")
source("figure_format_functions.R")
all_hla_expanded <- readRDS("all_hla_expanded.RDS")
isb_path <- "/labs/khatrilab/solomonb/covid/isb"
isb_samples <- read_tsv("/labs/khatrilab/solomonb/covid/isb/logs/210217_232725/parallel.log") %>% 
  separate(Command, into = c(NA, "sample"), sep = " ") %>% 
  pull(sample) %>% unique()
```

### Get reference alleles

- GCh38 reference alleles (per https://www.ebi.ac.uk/ipd/imgt/hla/help/genomics.html)
- To be used to calculate coverage

```{r, message = F}
reference_alleles <- read_tsv("GCh38_reference_alleles.txt", col_names = F) %>% pull(1)
genome_region_reference <- get_allele_length(reference_alleles)
```

### Get arcasHLA alignment stats

```{r}
arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)
```

### Import cell counts

```{r}
cell_counts_df <- readRDS("isb_sequenced_cell_count.RDS")
```

### Assemble accuracy DF

```{r}
success_df <- readRDS("isb_success.RDS")
accuracy_df <- readRDS("isb_accuracy_drb345_filtered.RDS")
accuracy_df <- accuracy_df %>% 
  left_join(
    success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(cell_counts_df, by = "sample") %>%
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer") 
```


### Coverage based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


# Subsample READS

### Import read subsample data

```{r}
# Import predicted genotypes
subsample_reads_path <- "/labs/khatrilab/solomonb/covid/isb/subsample"
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
```

```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_reads_df <- subsample_reads_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_reads_df
```

```{r}
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, "isb_subsample_reads_accuracy.RDS")
```


```{r}
# Calculate success
subsample_reads_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_reads_success_df, "isb_subsample_reads_success.RDS")
```

```{r}
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>% 
  left_join(
    subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>%  # Calculate coverage
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%  # Calculate n_alleles
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
```

### Reads plots
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
```
- Default arcasHLA cut off 75 reads (log10 = 1.8)

```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_reads_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(y = n_alleles, x = log10(coverage)))+
  # geom_violin(scale = "count", alpha = 0.5, )+
  geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  # coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()
```

# Subsample CELLS

```{r}
# Import predicted genotypes
subsample_cells_path <- "/labs/khatrilab/solomonb/covid/isb/subsample_cells"
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
```


```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_cells_df <- subsample_cells_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_cells_df
```

```{r}
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
```

```{r}
# Import cell counts for each subsample
subsample_cell_counts_df <- tibble(line = system("wc -l /labs/khatrilab/solomonb/covid/isb/subsample_cells/barcodes/INCOV*", intern = T)) %>% 
  separate(line, into = c("n_cells", "file"), sep = " /") %>% 
  drop_na() %>% 
  mutate(sample = gsub("\\..*","",basename(file))) %>% 
  mutate(n_cells = as.numeric(n_cells)) %>% 
  select(-file) %>% 
  # separate(sample, into = c("sample", "subset"), sep = "_") %>% 
  bind_rows(cell_counts_df)
subsample_cell_counts_df
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, "isb_subsample_cells_accuracy.RDS")
```

```{r}
# Calculate success
subsample_cells_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_cells_success_df, "isb_subsample_cells_success.RDS")
```

```{r}
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>% 
  left_join(
    subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(subsample_cell_counts_df, by = "sample") %>%
  bind_rows(accuracy_df) %>% 
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
```

### Coverage plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


```{r, fig.width=12, fig.height=3, warning = F, message = F}
# subsample_cells_combined_df %>% 
#   mutate(reads = reads/n_cells) %>% 
#   filter(reads < 100) %>% 
#   gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F) 
```








# Combined plots
```{r, fig.width = 9, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
plot_grid(
  plot_grid(
    plt_read_accuracy_abc + no_leg,
    plt_read_success_abc + no_leg,
    plt_read_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_abc + no_leg,
    plt_cell_success_abc + no_leg,
    plt_cell_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

```{r, fig.width = 18, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
plot_grid(
  plot_grid(
    plt_read_accuracy_all + no_leg,
    plt_read_success_all + no_leg,
    plt_read_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_all + no_leg,
    plt_cell_success_all + no_leg,
    plt_cell_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

# Slope stats

```{r}
get_lm_stats <- function(df, x_var = "coverage", y_var = "accuracy", field_var = "field_2", x_log = T){
  # Input checks
  arg_col <- makeAssertCollection()
  assertChoice(y_var, c("accuracy", "success", "n_alleles"), add = arg_col)
  assertChoice(x_var, c("reads", "coverage", "n_cells"), add = arg_col)
  assertChoice(field_var, c("field_1", "field_2", "field_3"), add = arg_col)
  assertLogical(x_log, add = arg_col)
  if (arg_col$isEmpty()==F) {map(arg_col$getMessages(),print);reportAssertions(arg_col)}
  
  # browser()
  df <- df %>% 
    filter(field == field_var) %>% 
    remove_invalid_combinations()
  if (x_log ==T){df <- df %>% mutate_at(x_var, log10)}
  df %>%
  filter_at(x_var, function(x) !is.na(x) & !is.infinite(x)) %>% 
  group_by(locus, field, genotyper) %>%
  nest() %>%
  mutate(data = map(data, function(x){
    summary(lm(!!sym(y_var) ~ !!sym(x_var), data = x)) %>% broom::tidy() %>% filter(term == x_var)
  })) %>% unnest_wider(data) %>%
  mutate(est_min = round(estimate - 2*std.error, 2),
         est_max = round(estimate + 2*std.error, 2))
}

model_stats <- get_lm_stats(subsample_cells_combined_df, x_var = "reads", y_var = "accuracy", x_log = T)
model_stats
```

```{r}
model_stats <- bind_rows(
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
)  %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") 
```



# Table
```{r}
col_vars <- c("x_var", "locus")
row_vars <- c("y_var", "genotyper")

flex_df <- model_stats %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") %>% 
  mutate(CI = sprintf("(%s) - (%s)", est_min, est_max)) %>% 
  mutate(genotyper = reformat_hla_genotyper(genotyper)) %>% 
  select(genotyper, locus, CI, x_var, y_var) %>% 
  pivot_wider(names_from = col_vars, values_from = "CI", names_sep =  "-") 

df_key <- tibble(col_keys = names(flex_df)) %>% 
  filter(!(col_keys %in% c(col_vars, row_vars))) %>% 
  separate(col_keys, into = col_vars, sep = "-", remove = F) %>%
  mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
  arrange_at(col_vars) %>% 
  mutate_all(as.character) %>% 
  mutate_at(col_vars[!(col_vars %in% "locus")], str_to_sentence)

flex_df %>% 
  select(row_vars, everything()) %>% 
  mutate_at(row_vars[!(row_vars %in% "genotyper")], str_to_sentence) %>% 
  # select(locus, df_key$col_keys) %>% 
  flextable() %>% 
  colformat_char(na_str = "---") %>%
  merge_v(j=1:2) %>% 
  set_header_df(mapping = df_key, key = "col_keys") %>% 
  merge_h(part = "header") %>%
  theme_vanilla() %>% 
  # vline(j=vlines_sequence, border = fp_border_default()) %>%
  fix_border_issues() %>% 
  autofit() %>% 
  fontsize(size = 8, part = "all") %>% 
  print(preview = "pptx")
```


```{r}
  df <- df %>% 
    mutate(field = reformat_hla_field(field)) %>% 
    mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
    mutate(cell_value = ifelse(!is.na(se), 
                               sprintf("%s %s %s", round(mean_accuracy,2),"\u00B1",round(se,2)),
                               sprintf("%s", round(mean_accuracy,2)))) %>%
    mutate(cell_value = ifelse(grepl("NA", cell_value), NA, cell_value)) %>% 
    select(-sd,-se, -mean_accuracy) %>% 
    pivot_wider(names_from = nesting_vars, values_from = "cell_value", names_sep = "-") 
  
  df_key <- suppressWarnings({tibble(col_keys = names(df)) %>% 
      filter(!grepl("locus", col_keys)) %>% 
      separate(col_keys, into = nesting_vars, sep = "-", remove = F) %>%
      mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
      arrange_at(nesting_vars) %>% 
      mutate_all(as.character)
  })
  
  vline_var <- tail(nesting_vars,1)
  vlines_sequence <- seq(1,nrow(df_key),by = length(unique(df_key[[vline_var]])))
  df <- df %>% 
    select(locus, df_key$col_keys) %>% 
    flextable() %>% 
    colformat_char(na_str = "---") %>% 
    set_header_df(mapping = df_key, key = "col_keys") %>% 
    merge_h(part = "header") %>% 
    theme_vanilla() %>% 
    vline(j=vlines_sequence, border = fp_border_default()) %>%
    fix_border_issues()
  return(df)
```





# Scratch work

```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_cells_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(x = n_alleles, y = log10(n_cells)))+
  geom_violin(scale = "width")+
  geom_jitter(size = 0.2, height = 0, width =0.05, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()
```
# Coverage stats
```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_cells_combined_df %>% 
  ggplot(aes(x = n_cells, y = coverage, color = genotyper)) + 
  geom_point() +
  facet_wrap(genotyper ~ locus, scales = "free", nrow = 3) + 
  theme_bw() +
  theme(strip.text = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
```

```{r, fig.width=16, fig.height=3, warning = F, message = F}
subsample_cells_combined_df %>% 
  select(-genotyper) %>% distinct() %>% 
  mutate(cov_per_cell = coverage/n_cells) %>% 
  ggplot(aes(x = log10(n_cells), y = cov_per_cell)) + 
  geom_point(color = "black", alpha = 0.1) +
  stat_smooth(method = "lm")+
  # stat_smooth()+
    facet_wrap( ~ locus, scales = "free", nrow = 1) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
```

```{r, fig.width=16, fig.height=6, warning = F, message = F}
subsample_cells_combined_df %>% 
  select(-genotyper) %>% distinct() %>% 
  filter(!is.na(n_cells)) %>% 
  mutate(cov_per_cell = coverage/n_cells) %>% 
  mutate(n_cells = cut(n_cells, seq(0,10000,by=2500), right = F)) %>% 
  ggplot(aes(x = n_cells, y = cov_per_cell)) + 
  stat_summary(fun = function(x) mean(x,na.rm=T), geom = "bar") +
  # geom_point(color = "black", alpha = 0.1) +
  # stat_smooth(method = "lm")+
  # stat_smooth()+
  facet_wrap( ~ locus, scales = "free", nrow = 1) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
```

```{r}
hist(cut(1:100, seq(1,100,10)))
```

```{r}
subsample_cells_combined_df %>% 
  ungroup() %>% 
  count(genotyper, locus, field)
```
```{r}
subsample_reads_combined_df %>% 
  ungroup() %>% 
  count(genotyper, locus, field)
```
